In [ ]:
## Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.style.use('dark_background')
import seaborn as sns
color = sns.color_palette()
from matplotlib.colors import ListedColormap
from scipy.stats import norm, boxcox
from scipy import stats
from tqdm import tqdm_notebook
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix, r2_score, accuracy_score
from sklearn.model_selection import (GridSearchCV, KFold, train_test_split, cross_val_score)
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import svm
from xgboost.sklearn import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')
In [2]:
## Load the raw water-potability data.
## NOTE(review): hardcoded absolute Windows path — consider a DATA_DIR constant.
df = pd.read_csv("E:/URI Materials/3rd Semester/STA 500/Project/Water_Probability/water_potability.csv")
# Treat the binary response as categorical rather than numeric.
df["Potability"] = df["Potability"].astype('category')
In [5]:
## Preview the first 20 rows of the raw data (note the NaNs in ph/Sulfate).
df.head(20)
Out[5]:
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 204.890456 | 20791.31898 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | 0 |
| 1 | 3.716080 | 129.422921 | 18630.05786 | 6.635246 | NaN | 592.885359 | 15.180013 | 56.329076 | 4.500656 | 0 |
| 2 | 8.099124 | 224.236259 | 19909.54173 | 9.275884 | NaN | 418.606213 | 16.868637 | 66.420093 | 3.055934 | 0 |
| 3 | 8.316766 | 214.373394 | 22018.41744 | 8.059332 | 356.886136 | 363.266516 | 18.436525 | 100.341674 | 4.628771 | 0 |
| 4 | 9.092223 | 181.101509 | 17978.98634 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | 0 |
| 5 | 5.584087 | 188.313324 | 28748.68774 | 7.544869 | 326.678363 | 280.467916 | 8.399735 | 54.917862 | 2.559708 | 0 |
| 6 | 10.223862 | 248.071735 | 28749.71654 | 7.513408 | 393.663395 | 283.651634 | 13.789695 | 84.603556 | 2.672989 | 0 |
| 7 | 8.635849 | 203.361523 | 13672.09176 | 4.563009 | 303.309771 | 474.607645 | 12.363817 | 62.798309 | 4.401425 | 0 |
| 8 | NaN | 118.988579 | 14285.58385 | 7.804174 | 268.646941 | 389.375566 | 12.706049 | 53.928846 | 3.595017 | 0 |
| 9 | 11.180284 | 227.231469 | 25484.50849 | 9.077200 | 404.041635 | 563.885481 | 17.927806 | 71.976601 | 4.370562 | 0 |
| 10 | 7.360640 | 165.520797 | 32452.61441 | 7.550701 | 326.624353 | 425.383420 | 15.586810 | 78.740016 | 3.662292 | 0 |
| 11 | 7.974522 | 218.693300 | 18767.65668 | 8.110385 | NaN | 364.098230 | 14.525746 | 76.485911 | 4.011718 | 0 |
| 12 | 7.119824 | 156.704993 | 18730.81365 | 3.606036 | 282.344050 | 347.715027 | 15.929536 | 79.500778 | 3.445756 | 0 |
| 13 | NaN | 150.174923 | 27331.36196 | 6.838223 | 299.415781 | 379.761835 | 19.370807 | 76.509996 | 4.413974 | 0 |
| 14 | 7.496232 | 205.344982 | 28388.00489 | 5.072558 | NaN | 444.645352 | 13.228311 | 70.300213 | 4.777382 | 0 |
| 15 | 6.347272 | 186.732881 | 41065.23476 | 9.629596 | 364.487687 | 516.743282 | 11.539781 | 75.071617 | 4.376348 | 0 |
| 16 | 7.051786 | 211.049406 | 30980.60079 | 10.094796 | NaN | 315.141267 | 20.397022 | 56.651604 | 4.268429 | 0 |
| 17 | 9.181560 | 273.813807 | 24041.32628 | 6.904990 | 398.350517 | 477.974642 | 13.387341 | 71.457362 | 4.503661 | 0 |
| 18 | 8.975464 | 279.357167 | 19460.39813 | 6.204321 | NaN | 431.443990 | 12.888759 | 63.821237 | 2.436086 | 0 |
| 19 | 7.371050 | 214.496611 | 25630.32004 | 4.432669 | 335.754439 | 469.914552 | 12.509164 | 62.797277 | 2.560299 | 0 |
In [3]:
## Transposed summary statistics, with a color gradient highlighting
## count/mean/std/median so the columns with missing data stand out.
df.describe().T.style.background_gradient(subset=['mean','std','50%','count'], cmap='PuBu')
Out[3]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| ph | 2785.000000 | 7.080795 | 1.594320 | 0.000000 | 6.093092 | 7.036752 | 8.062066 | 14.000000 |
| Hardness | 3276.000000 | 196.369496 | 32.879761 | 47.432000 | 176.850538 | 196.967627 | 216.667456 | 323.124000 |
| Solids | 3276.000000 | 22014.092526 | 8768.570828 | 320.942611 | 15666.690300 | 20927.833605 | 27332.762125 | 61227.196010 |
| Chloramines | 3276.000000 | 7.122277 | 1.583085 | 0.352000 | 6.127421 | 7.130299 | 8.114887 | 13.127000 |
| Sulfate | 2495.000000 | 333.775777 | 41.416840 | 129.000000 | 307.699498 | 333.073546 | 359.950170 | 481.030642 |
| Conductivity | 3276.000000 | 426.205111 | 80.824064 | 181.483754 | 365.734414 | 421.884968 | 481.792305 | 753.342620 |
| Organic_carbon | 3276.000000 | 14.284970 | 3.308162 | 2.200000 | 12.065801 | 14.218338 | 16.557652 | 28.300000 |
| Trihalomethanes | 3114.000000 | 66.396293 | 16.175008 | 0.738000 | 55.844536 | 66.622485 | 77.337473 | 124.000000 |
| Turbidity | 3276.000000 | 3.966786 | 0.780382 | 1.450000 | 3.439711 | 3.955028 | 4.500320 | 6.739000 |
In [4]:
## Missing data in dataset: percentage of missing values per feature,
## sorted descending. Use len(df) instead of the hard-coded row count 3276
## so the cell stays correct if the data changes.
Miss = df.isna().sum().sort_values(ascending=False).to_frame()
Miss = 100 * Miss / len(df)
Miss
Out[4]:
| 0 | |
|---|---|
| Sulfate | 23.840049 |
| ph | 14.987790 |
| Trihalomethanes | 4.945055 |
| Hardness | 0.000000 |
| Solids | 0.000000 |
| Chloramines | 0.000000 |
| Conductivity | 0.000000 |
| Organic_carbon | 0.000000 |
| Turbidity | 0.000000 |
| Potability | 0.000000 |
In [6]:
## Heatmap of the raw missing-value counts per feature.
## Renamed from `Miss` so we don't silently overwrite the percentage frame
## computed (and displayed) in the earlier cell with a different quantity.
missing_counts = df.isna().sum().sort_values(ascending=False).to_frame()
plt.title('Missing Values Per Feature')
# fmt='d' — the counts are integers.
sns.heatmap(missing_counts, annot=True, fmt='d', cmap='vlag')
Out[6]:
<Axes: title={'center': 'Missing Values Per Feature'}>
In [7]:
## Pairwise Pearson correlations between the numeric features.
## numeric_only=True: 'Potability' was cast to categorical, and pandas >= 2.0
## raises on non-numeric columns in DataFrame.corr(); older pandas silently
## dropped them, so the result is unchanged there.
Corrmat = df.corr(numeric_only=True)
plt.subplots(figsize=(7, 7))
sns.heatmap(Corrmat, cmap="YlGnBu", square=True, annot=True, fmt='.2f')
plt.show()
In [8]:
## List the feature names (last column is the response, 'Potability').
df.columns
Out[8]:
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
dtype='object')
In [9]:
## Kernel density estimate of Turbidity — quick check of its distribution shape.
sns.kdeplot(df["Turbidity"])
Out[9]:
<Axes: xlabel='Turbidity', ylabel='Density'>
In [ ]:
In [10]:
## Pairwise scatter/density plots, colored by the response class.
sns.pairplot(df, hue="Potability")
Out[10]:
<seaborn.axisgrid.PairGrid at 0x22aadd04050>
Dropping the rows with missing values and checking the performance of the different models without any imputation.¶
In [11]:
## Complete-case dataset: drop every row that has any missing value.
df_c=df.dropna()
In [12]:
## Dividing predictors (all columns but the last) and response (Potability),
## on the complete-case data.
X, y = df_c.iloc[:, :-1], df_c.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=123)

def compare_models(models, X_train, X_test, y_train, y_test,
                   n_splits=10, seed=7):
    """Cross-validate and test-score a list of fitted-in-place classifiers.

    For each model: 'Train_Accuracy' is the mean accuracy over a shuffled
    ``n_splits``-fold CV on the training split; 'Test_Accuracy' is the
    accuracy on the held-out test split after fitting on all training data.
    Returns one DataFrame row per model.
    """
    kfold = KFold(n_splits=n_splits, random_state=seed, shuffle=True)
    rows = []
    for mdl in models:
        cv_acc = cross_val_score(mdl, X_train, y_train,
                                 scoring='accuracy', cv=kfold).mean()
        mdl.fit(X_train, y_train)
        test_acc = metrics.accuracy_score(y_test, mdl.predict(X_test))
        rows.append({'Model': mdl,
                     'Train_Accuracy': cv_acc,
                     'Test_Accuracy': test_acc})
    return pd.DataFrame(rows)

model = [LogisticRegression(), DecisionTreeClassifier(max_depth=6), GaussianNB(),
         RandomForestClassifier(), XGBClassifier()]
print('The comparison\n')
modelScore = compare_models(model, X_train, X_test, y_train, y_test)
modelScore
The comparision
Out[12]:
| Model | Train_Accuracy | Test_Accuracy | |
|---|---|---|---|
| 0 | LogisticRegression() | 0.593435 | 0.597682 |
| 1 | DecisionTreeClassifier(max_depth=6) | 0.620481 | 0.644040 |
| 2 | GaussianNB() | 0.617518 | 0.617550 |
| 3 | (DecisionTreeClassifier(max_features='sqrt', r... | 0.656008 | 0.673841 |
| 4 | XGBClassifier(base_score=None, booster=None, c... | 0.645350 | 0.649007 |
In [ ]:
In [ ]:
Imputing missing values with the mean computed separately within each response class (Potability 0 or 1), then checking the performance of the models again.¶
In [13]:
df["Potability"]=df["Potability"].astype('category')
#################################### Imputing 'ph' value #####################################
phMean_0 = df[df['Potability'] == 0]['ph'].mean(skipna=True)
df.loc[(df['Potability'] == 0) & (df['ph'].isna()), 'ph'] = phMean_0
phMean_1 = df[df['Potability'] == 1]['ph'].mean(skipna=True)
df.loc[(df['Potability'] == 1) & (df['ph'].isna()), 'ph'] = phMean_1
##################################### Imputing 'Sulfate' value #####################################
SulfateMean_0 = df[df['Potability'] == 0]['Sulfate'].mean(skipna=True)
df.loc[(df['Potability'] == 0) & (df['Sulfate'].isna()), 'Sulfate'] = SulfateMean_0
SulfateMean_1 = df[df['Potability'] == 1]['Sulfate'].mean(skipna=True)
df.loc[(df['Potability'] == 1) & (df['Sulfate'].isna()), 'Sulfate'] = SulfateMean_1
################################ Imputing 'Trihalomethanes' value #####################################
TrihalomethanesMean_0 = df[df['Potability'] == 0]['Trihalomethanes'].mean(skipna=True)
df.loc[(df['Potability'] == 0) & (df['Trihalomethanes'].isna()), 'Trihalomethanes'] = TrihalomethanesMean_0
TrihalomethanesMean_1 = df[df['Potability'] == 1]['Trihalomethanes'].mean(skipna=True)
df.loc[(df['Potability'] == 1) & (df['Trihalomethanes'].isna()), 'Trihalomethanes'] = TrihalomethanesMean_1
In [ ]:
In [14]:
## Dividing Predictors and Response
## (now on the group-mean-imputed full dataset; same 70/30 split and seed
## as the complete-case run so the comparison is like-for-like)
X, y = df.iloc[:,:-1], df.iloc[:,-1]
X_train, X_test, y_train, y_test= train_test_split(X, y,
test_size=0.3, random_state=123)
In [15]:
## Candidate classifiers for the group-mean-imputed data, now with a linear
## SVM added. `model`, `trainAccuracy` and `testAccuracy` are read by the
## next cell to build the comparison table.
model = [LogisticRegression(), DecisionTreeClassifier(max_depth=6), GaussianNB(),
         RandomForestClassifier(), svm.LinearSVC(), XGBClassifier()]
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
trainAccuracy = []
testAccuracy = []
for clf in model:
    # Mean 10-fold CV accuracy on the training split.
    cv_scores = cross_val_score(clf, X_train, y_train,
                                scoring='accuracy', cv=kfold)
    trainAccuracy.append(cv_scores.mean())
    # Accuracy on the held-out 30% test split.
    clf.fit(X_train, y_train)
    testAccuracy.append(metrics.accuracy_score(y_test, clf.predict(X_test)))
In [16]:
## Tabulate CV (train) and held-out (test) accuracy per model.
## Fixed typo in the printed heading: "comparision" -> "comparison".
print('The comparison\n')
modelScore = pd.DataFrame({'Model': model,
                           'Train_Accuracy': trainAccuracy,
                           'Test_Accuracy': testAccuracy})
modelScore
The comparision
Out[16]:
| Model | Train_Accuracy | Test_Accuracy | |
|---|---|---|---|
| 0 | LogisticRegression() | 0.607458 | 0.617497 |
| 1 | DecisionTreeClassifier(max_depth=6) | 0.754914 | 0.730417 |
| 2 | GaussianNB() | 0.617946 | 0.617497 |
| 3 | (DecisionTreeClassifier(max_features='sqrt', r... | 0.798510 | 0.777213 |
| 4 | LinearSVC() | 0.540646 | 0.384537 |
| 5 | XGBClassifier(base_score=None, booster=None, c... | 0.778012 | 0.776195 |
In [17]:
## Load the externally imputed dataset df_mice.csv.
## NOTE(review): presumably produced by MICE (e.g. the R `mice` package
## with default PMM) — confirm provenance and imputation settings.
df= pd. read_csv("E:/URI Materials/3rd Semester/STA 500/Project/Water_Probability/df_mice.csv")
df["Potability"]=df["Potability"].astype('category')
In [ ]:
In [18]:
## MICE-imputed data: 70/30 split, then CV + test accuracy for the five
## classifiers (same protocol as the earlier runs).
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=123)
model = [LogisticRegression(), DecisionTreeClassifier(max_depth=6), GaussianNB(),
         RandomForestClassifier(), XGBClassifier()]
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
trainAccuracy = []
testAccuracy = []
for clf in model:
    # Mean 10-fold CV accuracy on the training split.
    trainAccuracy.append(cross_val_score(clf, X_train, y_train,
                                         scoring='accuracy', cv=kfold).mean())
    # Accuracy on the held-out test split.
    clf.fit(X_train, y_train)
    testAccuracy.append(metrics.accuracy_score(y_test, clf.predict(X_test)))
print('The comparision\n')
modelScore = pd.DataFrame({'Model': model, 'Train_Accuracy': trainAccuracy,
                           'Test_Accuracy': testAccuracy})
modelScore
The comparision
Out[18]:
| Model | Train_Accuracy | Test_Accuracy | |
|---|---|---|---|
| 0 | LogisticRegression() | 0.613617 | 0.603255 |
| 1 | DecisionTreeClassifier(max_depth=6) | 0.651559 | 0.634791 |
| 2 | GaussianNB() | 0.630646 | 0.621567 |
| 3 | (DecisionTreeClassifier(max_features='sqrt', r... | 0.685563 | 0.687691 |
| 4 | XGBClassifier(base_score=None, booster=None, c... | 0.665090 | 0.664293 |
In [21]:
## MICE-CART-imputed data: load, split 70/30, and score the five classifiers
## with the same CV protocol.
df = pd.read_csv("E:/URI Materials/3rd Semester/STA 500/Project/Water_Probability/df_mice_cart.csv")
df["Potability"] = df["Potability"].astype('category')
# Predictors are every column except the last; response is 'Potability'.
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=123)
model = [LogisticRegression(), DecisionTreeClassifier(max_depth=6), GaussianNB(),
         RandomForestClassifier(), XGBClassifier()]
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
trainAccuracy = []
testAccuracy = []
for clf in model:
    # Mean 10-fold CV accuracy on the training split.
    trainAccuracy.append(cross_val_score(clf, X_train, y_train,
                                         scoring='accuracy', cv=kfold).mean())
    # Accuracy on the held-out test split.
    clf.fit(X_train, y_train)
    testAccuracy.append(metrics.accuracy_score(y_test, clf.predict(X_test)))
print('The comparision\n')
modelScore = pd.DataFrame({'Model': model, 'Train_Accuracy': trainAccuracy,
                           'Test_Accuracy': testAccuracy})
modelScore
The comparision
Out[21]:
| Model | Train_Accuracy | Test_Accuracy | |
|---|---|---|---|
| 0 | LogisticRegression() | 0.612307 | 0.602238 |
| 1 | DecisionTreeClassifier(max_depth=6) | 0.647203 | 0.652085 |
| 2 | GaussianNB() | 0.623659 | 0.631740 |
| 3 | (DecisionTreeClassifier(max_features='sqrt', r... | 0.705645 | 0.681587 |
| 4 | XGBClassifier(base_score=None, booster=None, c... | 0.683818 | 0.662258 |
In [20]:
## Amelia-imputed data: load, split 70/30, and score the five classifiers
## with the same CV protocol.
df = pd.read_csv("E:/URI Materials/3rd Semester/STA 500/Project/Water_Probability/df_amelia.csv")
df["Potability"] = df["Potability"].astype('category')
# Predictors are every column except the last; response is 'Potability'.
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=123)
model = [LogisticRegression(), DecisionTreeClassifier(max_depth=6), GaussianNB(),
         RandomForestClassifier(), XGBClassifier()]
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
trainAccuracy = []
testAccuracy = []
for clf in model:
    # Mean 10-fold CV accuracy on the training split.
    trainAccuracy.append(cross_val_score(clf, X_train, y_train,
                                         scoring='accuracy', cv=kfold).mean())
    # Accuracy on the held-out test split.
    clf.fit(X_train, y_train)
    testAccuracy.append(metrics.accuracy_score(y_test, clf.predict(X_test)))
print('The comparision\n')
modelScore = pd.DataFrame({'Model': model, 'Train_Accuracy': trainAccuracy,
                           'Test_Accuracy': testAccuracy})
modelScore
The comparision
Out[20]:
| Model | Train_Accuracy | Test_Accuracy | |
|---|---|---|---|
| 0 | LogisticRegression() | 0.612307 | 0.602238 |
| 1 | DecisionTreeClassifier(max_depth=6) | 0.647638 | 0.651068 |
| 2 | GaussianNB() | 0.623659 | 0.631740 |
| 3 | (DecisionTreeClassifier(max_features='sqrt', r... | 0.700864 | 0.695829 |
| 4 | XGBClassifier(base_score=None, booster=None, c... | 0.683818 | 0.662258 |
In [23]:
## Hot-deck-imputed data (df_HDD.csv): load, split 70/30, and score the five
## classifiers with the same CV protocol.
df = pd.read_csv("E:/URI Materials/3rd Semester/STA 500/Project/Water_Probability/df_HDD.csv")
df["Potability"] = df["Potability"].astype('category')
# Predictors are every column except the last; response is 'Potability'.
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=123)
model = [LogisticRegression(), DecisionTreeClassifier(max_depth=6), GaussianNB(),
         RandomForestClassifier(), XGBClassifier()]
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
trainAccuracy = []
testAccuracy = []
for clf in model:
    # Mean 10-fold CV accuracy on the training split.
    trainAccuracy.append(cross_val_score(clf, X_train, y_train,
                                         scoring='accuracy', cv=kfold).mean())
    # Accuracy on the held-out test split.
    clf.fit(X_train, y_train)
    testAccuracy.append(metrics.accuracy_score(y_test, clf.predict(X_test)))
print('The comparision\n')
modelScore = pd.DataFrame({'Model': model, 'Train_Accuracy': trainAccuracy,
                           'Test_Accuracy': testAccuracy})
modelScore
The comparision
Out[23]:
| Model | Train_Accuracy | Test_Accuracy | |
|---|---|---|---|
| 0 | LogisticRegression() | 0.612307 | 0.602238 |
| 1 | DecisionTreeClassifier(max_depth=6) | 0.646767 | 0.649034 |
| 2 | GaussianNB() | 0.623659 | 0.631740 |
| 3 | (DecisionTreeClassifier(max_features='sqrt', r... | 0.697807 | 0.692777 |
| 4 | XGBClassifier(base_score=None, booster=None, c... | 0.683818 | 0.662258 |
Conclusion:¶
We can see that the performance of the different classification models improved significantly after applying the various imputation methods, compared with using only the existing (non-missing) cases.